billets

Project Kaggle - Marketing Analytics

Librairies
In [14]:
import kagglehub
import os
from skimpy import skim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.stats as stats
import seaborn as sns
from IPython.display import Image, display
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')
Functions & Dictionary
In [66]:
from MC_Functions import *

def Column_distribution(DF):
    """
    Displays distributions, boxplots, and identifies outliers for each numeric column in a DataFrame.
    
    Arguments:
    DF: The pandas DataFrame containing the data.

    """
    DF = DF.select_dtypes(include=[np.number])
    for i, column in enumerate(DF.columns, start=0):
        # Chart to visualize outliers
        plt.figure(figsize=(10, 2.5))
        plt.subplot(1, 2, 1) 
        # Graphical visualization
        sns.histplot(DF[column],bins=30, kde=True, color='teal', edgecolor='black', linewidth=1, line_kws={'color': 'red'})
        plt.title(f'Distribution: {column}', fontsize=12, color='firebrick', fontweight='bold')
        # Box plot to detect outliers
        plt.subplot(1, 2, 2)
        # Visualization as a boxplot
        sns.boxplot(x=DF[column], color='teal', 
                    notch=True, whiskerprops={'linewidth': 1}, showmeans=True,
                    meanprops=dict(marker='p', markerfacecolor='white', markeredgecolor='black', markersize=8),
                    medianprops=dict(linestyle='-', linewidth=2, color='firebrick'),
                    flierprops=dict(marker='p', markerfacecolor='firebrick', markersize=5, markeredgecolor='black'),
                    boxprops=dict(linestyle='-', linewidth=1.5),
                    showfliers=True, width=0.4)
                   
        plt.title(f'Boxplot: {column}', fontsize=12, color='firebrick', fontweight='bold')
        # Median & mean value display:
        median_value = np.median(DF[column])
        moy_value = np.mean(DF[column])
        plt.axvline(x=median_value, ymin=0.5, ymax=0, color='firebrick', linestyle=':', linewidth=1)
        plt.axvline(x=moy_value, ymin=0.5, ymax=1, color='blue', linestyle=':', linewidth=1)
        plt.text(moy_value, -0.3, f'Mean: {moy_value:.2f}', color='blue', fontsize=12, ha='left')
        plt.text(median_value, 0.4, f'Median: {median_value:.2f}', color='firebrick', fontsize=12, ha='right')
        plt.tight_layout()
        plt.show()
        # IQR calculation
        Q1 = DF[column].quantile(0.25)
        Q3 = DF[column].quantile(0.75)
        IQR = Q3 - Q1
        # Determination of thresholds for Outliers
        seuil_inf = Q1 - 1.5 * IQR
        seuil_sup = Q3 + 1.5 * IQR
        # Outlier Identification
        outliers = DF[(DF[column] < seuil_inf) | (DF[column] > seuil_sup)]
        # Outlier Display
        Tmess('{} : {} Outliers'.format(column, outliers.shape[0]), Color='black')
        Tmess('---------------------')

📊 About the Dataset¶

🌐 Context¶

This dataset is publicly available on GitHub and can be used for:

  • 🔍 Exploratory Data Analysis (EDA)
  • 📈 Statistical Analysis
  • 📊 Data Visualizations

📁 Content¶

The dataset, ifood_df.csv, contains information on 2,205 customers from the XYZ company. It provides valuable insights into key areas such as:

  • 🧑‍💼 Customer Profiles — Demographic and personal information about customers.
  • 🛒 Product Preferences — Details on customer preferences for specific products.
  • 🚀 Campaign Successes & Failures — Data on the performance of marketing campaigns.
  • 🌐 Channel Performance — Analysis of customer interactions across different channels.
In [69]:
# Dictionary display
display(Image(filename='dictionary.png'))
No description has been provided for this image
Data Cleaning and Preprocessing
Upload and general analysis of the file
In [73]:
# Reading th csv file
data = pd.read_csv('ifood_df.csv')
df=data.copy()
# Display
display(df.head(2))
Tmess('Dataframe dimensions: {} rows and {} columns'.format(df.shape[0],df.shape[1]), Color='blue', Size=12)
Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds ... marital_Together marital_Widow education_2n Cycle education_Basic education_Graduation education_Master education_PhD MntTotal MntRegularProds AcceptedCmpOverall
0 58138.0 0 0 58 635 88 546 172 88 88 ... 0 0 0 0 1 0 0 1529 1441 0
1 46344.0 1 1 38 11 1 6 2 1 6 ... 0 0 0 0 1 0 0 21 15 0

2 rows × 39 columns

Dataframe dimensions: 2205 rows and 39 columns
In [74]:
pd.set_option('display.max_columns', None)
df.describe()
Out[74]:
Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response Age Customer_Days marital_Divorced marital_Married marital_Single marital_Together marital_Widow education_2n Cycle education_Basic education_Graduation education_Master education_PhD MntTotal MntRegularProds AcceptedCmpOverall
count 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.0 2205.0 2205.00000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.00000
mean 51622.094785 0.442177 0.506576 49.009070 306.164626 26.403175 165.312018 37.756463 27.128345 44.057143 2.318367 4.100680 2.645351 5.823583 5.336961 0.073923 0.074376 0.073016 0.064399 0.013605 0.009070 3.0 11.0 0.15102 51.095692 2512.718367 0.104308 0.387302 0.216327 0.257596 0.034467 0.089796 0.024490 0.504762 0.165079 0.215873 562.764626 518.707483 0.29932
std 20713.063826 0.537132 0.544380 28.932111 337.493839 39.784484 217.784507 54.824635 41.130468 51.736211 1.886107 2.737424 2.798647 3.241796 2.413535 0.261705 0.262442 0.260222 0.245518 0.115872 0.094827 0.0 0.0 0.35815 11.705801 202.563647 0.305730 0.487244 0.411833 0.437410 0.182467 0.285954 0.154599 0.500091 0.371336 0.411520 575.936911 553.847248 0.68044
min 1730.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.00000 24.000000 2159.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.000000 -283.000000 0.00000
25% 35196.000000 0.000000 0.000000 24.000000 24.000000 2.000000 16.000000 3.000000 1.000000 9.000000 1.000000 2.000000 0.000000 3.000000 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.00000 43.000000 2339.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 56.000000 42.000000 0.00000
50% 51287.000000 0.000000 0.000000 49.000000 178.000000 8.000000 68.000000 12.000000 8.000000 25.000000 2.000000 4.000000 2.000000 5.000000 6.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.00000 50.000000 2515.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 343.000000 288.000000 0.00000
75% 68281.000000 1.000000 1.000000 74.000000 507.000000 33.000000 232.000000 50.000000 34.000000 56.000000 3.000000 6.000000 4.000000 8.000000 7.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.00000 61.000000 2688.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 964.000000 884.000000 0.00000
max 113734.000000 2.000000 2.000000 99.000000 1493.000000 199.000000 1725.000000 259.000000 262.000000 321.000000 15.000000 27.000000 28.000000 13.000000 20.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 3.0 11.0 1.00000 80.000000 2858.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 2491.000000 2458.000000 4.00000
In [76]:
skim(df)
╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮
│          Data Summary                Data Types                                                                 │
│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓                                                          │
│ ┃ dataframe         ┃ Values ┃ ┃ Column Type ┃ Count ┃                                                          │
│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩                                                          │
│ │ Number of rows    │ 2205   │ │ int32       │ 38    │                                                          │
│ │ Number of columns │ 39     │ │ float64     │ 1     │                                                          │
│ └───────────────────┴────────┘ └─────────────┴───────┘                                                          │
│                                                     number                                                      │
│ ┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓  │
│ ┃ column_name            ┃ NA  ┃ NA %  ┃ mean     ┃ sd      ┃ p0   ┃ p25   ┃ p50   ┃ p75   ┃ p100   ┃ hist   ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩  │
│ │ Income                 │   0 │     0 │    51620 │   20710 │ 1730 │ 35200 │ 51290 │ 68280 │ 113700 │ ▂▇▇▇▃  │  │
│ │ Kidhome                │   0 │     0 │   0.4422 │  0.5371 │    0 │     0 │     0 │     1 │      2 │  ▇  ▆  │  │
│ │ Teenhome               │   0 │     0 │   0.5066 │  0.5444 │    0 │     0 │     0 │     1 │      2 │  ▇  ▇  │  │
│ │ Recency                │   0 │     0 │    49.01 │   28.93 │    0 │    24 │    49 │    74 │     99 │ ▇▇▇▇▇▇ │  │
│ │ MntWines               │   0 │     0 │    306.2 │   337.5 │    0 │    24 │   178 │   507 │   1493 │ ▇▂▂▁▁  │  │
│ │ MntFruits              │   0 │     0 │     26.4 │   39.78 │    0 │     2 │     8 │    33 │    199 │  ▇▁▁   │  │
│ │ MntMeatProducts        │   0 │     0 │    165.3 │   217.8 │    0 │    16 │    68 │   232 │   1725 │  ▇▁▁   │  │
│ │ MntFishProducts        │   0 │     0 │    37.76 │   54.82 │    0 │     3 │    12 │    50 │    259 │  ▇▁▁   │  │
│ │ MntSweetProducts       │   0 │     0 │    27.13 │   41.13 │    0 │     1 │     8 │    34 │    262 │  ▇▁▁   │  │
│ │ MntGoldProds           │   0 │     0 │    44.06 │   51.74 │    0 │     9 │    25 │    56 │    321 │  ▇▂▁   │  │
│ │ NumDealsPurchases      │   0 │     0 │    2.318 │   1.886 │    0 │     1 │     2 │     3 │     15 │  ▇▃▁   │  │
│ │ NumWebPurchases        │   0 │     0 │    4.101 │   2.737 │    0 │     2 │     4 │     6 │     27 │  ▇▃▁   │  │
│ │ NumCatalogPurchases    │   0 │     0 │    2.645 │   2.799 │    0 │     0 │     2 │     4 │     28 │   ▇▂   │  │
│ │ NumStorePurchases      │   0 │     0 │    5.824 │   3.242 │    0 │     3 │     5 │     8 │     13 │ ▂▇▃▃▂▃ │  │
│ │ NumWebVisitsMonth      │   0 │     0 │    5.337 │   2.414 │    0 │     3 │     6 │     7 │     20 │  ▅▇▇   │  │
│ │ AcceptedCmp3           │   0 │     0 │  0.07392 │  0.2617 │    0 │     0 │     0 │     0 │      1 │ ▇    ▁ │  │
│ │ AcceptedCmp4           │   0 │     0 │  0.07438 │  0.2624 │    0 │     0 │     0 │     0 │      1 │ ▇    ▁ │  │
│ │ AcceptedCmp5           │   0 │     0 │  0.07302 │  0.2602 │    0 │     0 │     0 │     0 │      1 │ ▇    ▁ │  │
│ │ AcceptedCmp1           │   0 │     0 │   0.0644 │  0.2455 │    0 │     0 │     0 │     0 │      1 │ ▇    ▁ │  │
│ │ AcceptedCmp2           │   0 │     0 │  0.01361 │  0.1159 │    0 │     0 │     0 │     0 │      1 │   ▇    │  │
│ │ Complain               │   0 │     0 │  0.00907 │ 0.09483 │    0 │     0 │     0 │     0 │      1 │   ▇    │  │
│ │ Z_CostContact          │   0 │     0 │        3 │       0 │    3 │     3 │     3 │     3 │      3 │     ▇  │  │
│ │ Z_Revenue              │   0 │     0 │       11 │       0 │   11 │    11 │    11 │    11 │     11 │     ▇  │  │
│ │ Response               │   0 │     0 │    0.151 │  0.3581 │    0 │     0 │     0 │     0 │      1 │ ▇    ▁ │  │
│ │ Age                    │   0 │     0 │     51.1 │   11.71 │   24 │    43 │    50 │    61 │     80 │ ▂▅▇▆▅▁ │  │
│ │ Customer_Days          │   0 │     0 │     2513 │   202.6 │ 2159 │  2339 │  2515 │  2688 │   2858 │ ▇▇▇▇▇▇ │  │
│ │ marital_Divorced       │   0 │     0 │   0.1043 │  0.3057 │    0 │     0 │     0 │     0 │      1 │ ▇    ▁ │  │
│ │ marital_Married        │   0 │     0 │   0.3873 │  0.4872 │    0 │     0 │     0 │     1 │      1 │ ▇    ▅ │  │
│ │ marital_Single         │   0 │     0 │   0.2163 │  0.4118 │    0 │     0 │     0 │     0 │      1 │ ▇    ▂ │  │
│ │ marital_Together       │   0 │     0 │   0.2576 │  0.4374 │    0 │     0 │     0 │     1 │      1 │ ▇    ▃ │  │
│ │ marital_Widow          │   0 │     0 │  0.03447 │  0.1825 │    0 │     0 │     0 │     0 │      1 │   ▇    │  │
│ │ education_2n Cycle     │   0 │     0 │   0.0898 │   0.286 │    0 │     0 │     0 │     0 │      1 │ ▇    ▁ │  │
│ │ education_Basic        │   0 │     0 │  0.02449 │  0.1546 │    0 │     0 │     0 │     0 │      1 │   ▇    │  │
│ │ education_Graduation   │   0 │     0 │   0.5048 │  0.5001 │    0 │     0 │     1 │     1 │      1 │ ▇    ▇ │  │
│ │ education_Master       │   0 │     0 │   0.1651 │  0.3713 │    0 │     0 │     0 │     0 │      1 │ ▇    ▂ │  │
│ │ education_PhD          │   0 │     0 │   0.2159 │  0.4115 │    0 │     0 │     0 │     0 │      1 │ ▇    ▂ │  │
│ │ MntTotal               │   0 │     0 │    562.8 │   575.9 │    4 │    56 │   343 │   964 │   2491 │ ▇▂▂▁▁  │  │
│ │ MntRegularProds        │   0 │     0 │    518.7 │   553.8 │ -283 │    42 │   288 │   884 │   2458 │ ▇▅▃▂▁  │  │
│ │ AcceptedCmpOverall     │   0 │     0 │   0.2993 │  0.6804 │    0 │     0 │     0 │     0 │      4 │   ▇▁   │  │
│ └────────────────────────┴─────┴───────┴──────────┴─────────┴──────┴───────┴───────┴───────┴────────┴────────┘  │
╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯
👉 No missing value
👉 The dataframe only contains numerical variables.
👉 Ouch, no new customers for more than 2150 days! Let's fix it:
- The dataset is a bit outdated, so I will reduce 'Customer_Days' by 2150 days.
👉 Irrelevant variables : 'Z_CostContact', 'Z_Revenue'
In [79]:
# Update 'Customer_Days'
df['Customer_Days'] = df['Customer_Days'] - 2150
print(df['Customer_Days'].describe())
Tmess("Great, last customer acquired only {} days ago 😉".format(df['Customer_Days'].min()), Color='blue', Size=12)
count    2205.000000
mean      362.718367
std       202.563647
min         9.000000
25%       189.000000
50%       365.000000
75%       538.000000
max       708.000000
Name: Customer_Days, dtype: float64
Great, last customer acquired only 9 days ago 😉
In [80]:
# check for presence of irrelevant variables and delete them if necessary
cols_to_remove = ['Z_CostContact', 'Z_Revenue']
df = df.drop(columns=[col for col in cols_to_remove if col in df.columns])

df.describe()
Out[80]:
Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response Age Customer_Days marital_Divorced marital_Married marital_Single marital_Together marital_Widow education_2n Cycle education_Basic education_Graduation education_Master education_PhD MntTotal MntRegularProds AcceptedCmpOverall
count 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.00000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.000000 2205.00000
mean 51622.094785 0.442177 0.506576 49.009070 306.164626 26.403175 165.312018 37.756463 27.128345 44.057143 2.318367 4.100680 2.645351 5.823583 5.336961 0.073923 0.074376 0.073016 0.064399 0.013605 0.009070 0.15102 51.095692 362.718367 0.104308 0.387302 0.216327 0.257596 0.034467 0.089796 0.024490 0.504762 0.165079 0.215873 562.764626 518.707483 0.29932
std 20713.063826 0.537132 0.544380 28.932111 337.493839 39.784484 217.784507 54.824635 41.130468 51.736211 1.886107 2.737424 2.798647 3.241796 2.413535 0.261705 0.262442 0.260222 0.245518 0.115872 0.094827 0.35815 11.705801 202.563647 0.305730 0.487244 0.411833 0.437410 0.182467 0.285954 0.154599 0.500091 0.371336 0.411520 575.936911 553.847248 0.68044
min 1730.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 24.000000 9.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.000000 -283.000000 0.00000
25% 35196.000000 0.000000 0.000000 24.000000 24.000000 2.000000 16.000000 3.000000 1.000000 9.000000 1.000000 2.000000 0.000000 3.000000 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 43.000000 189.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 56.000000 42.000000 0.00000
50% 51287.000000 0.000000 0.000000 49.000000 178.000000 8.000000 68.000000 12.000000 8.000000 25.000000 2.000000 4.000000 2.000000 5.000000 6.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 50.000000 365.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 343.000000 288.000000 0.00000
75% 68281.000000 1.000000 1.000000 74.000000 507.000000 33.000000 232.000000 50.000000 34.000000 56.000000 3.000000 6.000000 4.000000 8.000000 7.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 61.000000 538.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 964.000000 884.000000 0.00000
max 113734.000000 2.000000 2.000000 99.000000 1493.000000 199.000000 1725.000000 259.000000 262.000000 321.000000 15.000000 27.000000 28.000000 13.000000 20.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 80.000000 708.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 2491.000000 2458.000000 4.00000
Preparing the dataframe for analysis.
'MntTotal' & 'MntRegularProds' verification
In [83]:
# -----------------------------------Verification of the amounts indicated

# Total spending
df['Spending'] = df[['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']].sum(axis=1)
# Total regular spending
df['Spending_Regular'] = df[['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts']].sum(axis=1)

# Display
df[['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'MntTotal', 'MntRegularProds', 
    'Spending', 'Spending_Regular']].head(2)
Out[83]:
MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds MntTotal MntRegularProds Spending Spending_Regular
0 635 88 546 172 88 88 1529 1441 1617 1529
1 11 1 6 2 1 6 21 15 27 21
👉 First point to be confirmed with the company:
- Use of 'MntTotal' & 'MntRegularProds' data for the rest of the analysis?
👉 For this analysis, I will use the calculated variables 'Spending' and 'Spending_Regular'.
In [89]:
# Removal of inconsistent variables.
df.drop(['MntTotal', 'MntRegularProds'], axis=1, inplace=True)

# Rename calculated columns.
df.rename(columns={'Spending': 'MntTotal', 'Spending_Regular': 'MntRegularProds'}, inplace=True)
Creation of a single 'Education' column
In [92]:
df.loc[:, df.columns.str.startswith('education')].head(2)
Out[92]:
education_2n Cycle education_Basic education_Graduation education_Master education_PhD
0 0 0 1 0 0
1 0 0 1 0 0
In [94]:
# Create the 'education' column with the level of education
df['Education_level'] = df.apply(
    lambda row: 1 if row['education_Basic'] == 1 else (
        2 if row['education_2n Cycle'] == 1 else (
            3 if row['education_Graduation'] == 1 else (
                4 if row['education_Master'] == 1 else (
                    5 if row['education_PhD'] == 1 else 0)))),
    axis=1)
df.drop(['education_2n Cycle', 'education_Basic', 'education_Graduation', 'education_Master', 'education_PhD'], axis=1, inplace=True)
df.head(2)
Out[94]:
Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response Age Customer_Days marital_Divorced marital_Married marital_Single marital_Together marital_Widow AcceptedCmpOverall MntTotal MntRegularProds Education_level
0 58138.0 0 0 58 635 88 546 172 88 88 3 8 10 4 7 0 0 0 0 0 0 1 63 672 0 0 1 0 0 0 1617 1529 3
1 46344.0 1 1 38 11 1 6 2 1 6 2 1 1 2 5 0 0 0 0 0 0 0 66 122 0 0 1 0 0 0 27 21 3
Creation of 'Marital', 'Adult_household' and 'People_household' columns
In [97]:
df.loc[:, df.columns.str.startswith('marital') | df.columns.str.startswith('Kid') | df.columns.str.startswith('Teen')].head(2)
Out[97]:
Kidhome Teenhome marital_Divorced marital_Married marital_Single marital_Together marital_Widow
0 0 0 0 0 1 0 0
1 1 1 0 0 1 0 0
In [99]:
# Create the 'Adult_household' column with number of adult
df['Adult_household'] = df.apply(
    lambda row: 1 if row['marital_Divorced'] == 1 else (
        2 if row['marital_Married'] == 1 else (
            1 if row['marital_Single'] == 1 else (
                2 if row['marital_Together'] == 1 else (
                    1 if row['marital_Widow'] == 1 else 0)))),
    axis=1)

df['Marital'] = df.apply(
    lambda row: 'Divorced' if row['marital_Divorced'] == 1 else (
        'Married' if row['marital_Married'] == 1 else (
            'Single' if row['marital_Single'] == 1 else (
                'Together' if row['marital_Together'] == 1 else (
                    'Widow' if row['marital_Widow'] == 1 else 0)))),
    axis=1)

# Create the 'People_household' column with adult + children
df['People_household'] = df['Adult_household'] + df['Kidhome'] + df['Teenhome']

# Removal of 'marital' variables
df.drop(['marital_Divorced', 'marital_Married', 'marital_Single', 'marital_Together', 'marital_Widow'], axis=1, inplace=True)
# Display
df[[ 'Teenhome', 'Kidhome', 'Marital', 'Adult_household', 'People_household']].head(2)
Out[99]:
Teenhome Kidhome Marital Adult_household People_household
0 0 0 Single 1 1
1 1 1 Single 1 3
Creation of 'Total_Purchases' columns
In [102]:
df.loc[:, df.columns.str.endswith('Purchases')].head(2)
Out[102]:
NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases
0 3 8 10 4
1 2 1 1 2
In [104]:
# Total Purchases
df['Total_Purchases'] = df[['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']].sum(axis=1)
df[['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'Total_Purchases']].head(2)
Out[104]:
NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases Total_Purchases
0 3 8 10 4 25
1 2 1 1 2 6
Checking for inconsistencies between 'MntTotal' and 'Total_Purchases'.
In [107]:
df[df['Total_Purchases'] == 0][['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 
                                'MntGoldProds', 'Total_Purchases', 'MntTotal']]
Out[107]:
MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds Total_Purchases MntTotal
961 2 1 1 1 0 1 0 6
1499 2 1 1 0 0 1 0 5
In [109]:
df[df['Total_Purchases'] == 0][['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'Total_Purchases', 'MntTotal']]
Out[109]:
NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases Total_Purchases MntTotal
961 0 0 0 0 0 6
1499 0 0 0 0 0 5
👉 There is no amount without purchases, I delete these two raws
In [112]:
# line removal
df = df[df['Total_Purchases'] != 0]
Checking for inconsistencies in the 'MntTotal' / 'Income' ratio
In [115]:
# Verify the inconsistent ratio MntTotal/Income.
df[(df['MntTotal'] / df['Income']) >0.5]
Out[115]:
Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response Age Customer_Days AcceptedCmpOverall MntTotal MntRegularProds Education_level Adult_household Marital People_household Total_Purchases
20 2447.0 1 0 42 1 1 1725 1 1 1 15 0 28 0 1 0 0 0 0 0 0 0 41 548 0 1730 1729 3 2 Married 3 43
In [117]:
# line removal
df = df[(df['MntTotal'] / df['Income']) <0.5]
In [119]:
df_Prepared = df.copy()
# Save df:
df_Prepared.to_csv('df_Prepared.csv', index=False)
Exploratory Data Analysis
Univariate analysis
In [128]:
# Distribution of nonbinary variables:
df_no_binary = df.drop(['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 
                        'AcceptedCmp5', 'AcceptedCmpOverall', 'Complain', 'Response', 
                        'Marital', 'People_household', 'Adult_household', 'Kidhome', 
                        'Teenhome', 'Education_level'], axis=1)
# Visualization
Column_distribution(df_no_binary)

# Distribution of nonbinary variables:
df_binary = df[['AcceptedCmp1','AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmpOverall', 
                   'Complain', 'Response', 'Marital', 'People_household', 'Adult_household', 'Kidhome', 'Teenhome', 'Education_level']]

# Distribution of ordinal variables:
cols_per_row = 4
num_cols = len(df_binary.columns)  # Nombre total de colonnes à afficher
num_rows = -(-num_cols // cols_per_row)  # Calcul du nombre de lignes (division entière arrondie vers le haut)

# Creating the Subchart Grid
fig, axes = plt.subplots(num_rows, cols_per_row, figsize=(20, num_rows * 3))  # Taille ajustée dynamiquement
axes = axes.flatten()  # Convertit les axes en une liste 1D

# Visualization
Tmess("Distribution of binary and ordinal variables", Align='center', Color='Firebrick', Size='16', Weight='Bold')
for i, col in enumerate(df_binary.columns):
    sns.countplot(x=col, data=df, palette='viridis', ax=axes[i])
    axes[i].set_title(f'Distribution of {col}', size=16, color='firebrick')
    axes[i].set_xlabel('')  
    axes[i].set_ylabel('Count')

# Delete the empty columns
for i in range(num_cols, len(axes)):
    fig.delaxes(axes[i])
plt.tight_layout()
plt.show()
No description has been provided for this image
Income : 0 Outliers
---------------------
No description has been provided for this image
Recency : 0 Outliers
---------------------
No description has been provided for this image
MntWines : 34 Outliers
---------------------
No description has been provided for this image
MntFruits : 245 Outliers
---------------------
No description has been provided for this image
MntMeatProducts : 169 Outliers
---------------------
No description has been provided for this image
MntFishProducts : 222 Outliers
---------------------
No description has been provided for this image
MntSweetProducts : 238 Outliers
---------------------
No description has been provided for this image
MntGoldProds : 201 Outliers
---------------------
No description has been provided for this image
NumDealsPurchases : 81 Outliers
---------------------
No description has been provided for this image
NumWebPurchases : 3 Outliers
---------------------
No description has been provided for this image
NumCatalogPurchases : 19 Outliers
---------------------
No description has been provided for this image
NumStorePurchases : 0 Outliers
---------------------
No description has been provided for this image
NumWebVisitsMonth : 6 Outliers
---------------------
No description has been provided for this image
Age : 0 Outliers
---------------------
No description has been provided for this image
Customer_Days : 0 Outliers
---------------------
No description has been provided for this image
MntTotal : 3 Outliers
---------------------
No description has been provided for this image
MntRegularProds : 3 Outliers
---------------------
No description has been provided for this image
Total_Purchases : 0 Outliers
---------------------
Distribution of binary and ordinal variables
No description has been provided for this image
Bivariate & multivariate analysis
👉 Income vs Total spent
In [131]:
# Convert the 'icefire' palette to a hexadecimal list
icefire_palette = sns.color_palette('icefire', as_cmap=False).as_hex()

Tmess("Interactive Bubble Chart: Income vs MntTotal\nsize = 'People_household'", Color='firebrick', Align='center', Size=15, Weight='bold')
# Creating the interactive chart
fig = px.scatter(
    df, x='Income', y='MntTotal', color='Marital', size='People_household',       
    hover_data=[df.index, 'Income', 'MntTotal', 'Marital', 'People_household', 'Total_Purchases'],  
    labels={'Income': 'Income', 'MntTotal': 'Total Spent'}, color_discrete_sequence=icefire_palette )

# Updated Figure Size
fig.update_layout(
    title_font=dict(size=5, color='firebrick', family='Arial'),
    width=1100, height=400,
    legend=dict(title='Marital', x=1, y=1),
    plot_bgcolor='white',
    xaxis=dict(tickfont=dict(color='black'), linecolor='black', gridcolor='gray'),
    yaxis=dict(tickfont=dict(color='black'), linecolor='black', gridcolor='gray' )
    )
fig.show()

Tmess("With the regression lines", Color='firebrick', Align='center', Size=15, Weight='bold')

sns.lmplot(data=df, x='Income', y='MntTotal', hue='Marital', palette='icefire', height=4, aspect=2.4, legend=False)
# Visualization
plt.xlabel('Income')
plt.ylabel('Total Spent (MntTotal)')
plt.legend(title='Marital', bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)
plt.show()
Interactive Bubble Chart: Income vs MntTotal
size = 'People_household'
With the regression lines
No description has been provided for this image
👉 Marital Status & family
In [133]:
# Creation and preparation of the dataframe for analysis
df_child = df.copy()
df_child['Children'] = df['Kidhome'] + df['Teenhome'] 

# Add 'SingleChildren' column
df_child['SingleChildren'] = np.where(df_child['Adult_household'] == 1, df_child['Kidhome'] + df_child['Teenhome'], 0)

# Add 'CoupleChildren' column
df_child['CoupleChildren'] = np.where(df_child['Adult_household'] == 2, df_child['Kidhome'] + df_child['Teenhome'], 0)

# Save df:
df_child.to_csv('df_child.csv', index=False)

df_child.head(2)
Out[133]:
Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response Age Customer_Days AcceptedCmpOverall MntTotal MntRegularProds Education_level Adult_household Marital People_household Total_Purchases Children SingleChildren CoupleChildren
0 58138.0 0 0 58 635 88 546 172 88 88 3 8 10 4 7 0 0 0 0 0 0 1 63 672 0 1617 1529 3 1 Single 1 25 0 0 0
1 46344.0 1 1 38 11 1 6 2 1 6 2 1 1 2 5 0 0 0 0 0 0 0 66 122 0 27 21 3 1 Single 3 6 2 2 0
In [134]:
# --- Creation of the pivot table (sum of MntTotal for each Marital and People_household). ---
# Expenses
Df_Marital_expenses = df_child.pivot_table(values='MntTotal', index='Marital', columns='People_household', aggfunc='sum', fill_value=0)
Df_Marital_exp = Df_Marital_expenses.reset_index()
Df_Marital_exp.columns.name = None
Df_Marital_exp = Df_Marital_exp.melt(id_vars='Marital', var_name='People_household', value_name='Count')
# Count
Df_Marital_count = pd.crosstab(df_child['Marital'], df_child['People_household'])

display(Df_Marital_expenses, Df_Marital_count)
People_household 1 2 3 4 5
Marital
Divorced 58544 69579 13000 466 0
Married 0 237934 216730 42411 4747
Single 190930 82933 15605 3039 0
Together 0 177358 136336 26841 4523
Widow 29536 21111 4678 0 0
People_household 1 2 3 4 5
Marital
Divorced 56 119 49 5 0
Married 0 227 443 167 16
Single 168 221 73 14 0
Together 0 150 294 109 15
Widow 26 33 17 0 0
In [135]:
# ------------------------------------------- Marital Status & family analysis graphics

# -------------------------------COUNT Graphs
Tmess('Total customers by marital status and household size', Color='firebrick', Align='center', Size=15, Weight='bold')
# Barplot
plt.figure(figsize=(15, 4))
plt.subplot(1, 2, 1) 
sns.countplot(data=df_child, x='Marital', hue='People_household', palette='icefire', order=sorted(df['Marital'].unique()))
plt.ylabel('Count')
plt.xticks(rotation=30)
plt.grid(True, linestyle='-', linewidth=0.7, zorder=0)  # Grille en arrière-plan
plt.gca().set_axisbelow(True)

# Cumulative Barplot
plt.subplot(1, 2, 2) 
Df_Marital_count.plot(kind='bar', stacked=True, colormap='icefire', ax=plt.gca())  
plt.ylabel('Count')
plt.xticks(rotation=30)
plt.grid(True, linestyle='-', linewidth=0.7, zorder=0)  # Grille en arrière-plan
plt.gca().set_axisbelow(True)
plt.tight_layout()
plt.show()

# Comments with calculated percentages
One_child = df_child[(df_child['Children'] == 1)]['Marital'].count()
Couple_Customer = df_child[(df_child['Marital'] == 'Married') | (df_child['Marital'] == 'Together')]['Marital'].count()
Single_Customer = df_child[(df_child['Marital'] != 'Married') & (df_child['Marital'] != 'Together')]['Marital'].count()
Tmess('Comments', Size=14, Color='firebrick', Weight='bold')
Tmess('👉 Married, Together represent {} % of the customer base'.format(round(Couple_Customer *100/ len(df_child),2)), Size=12, Color='blue')
Tmess('👉 The different types of single (divorced/widow) represent {} %.'.format(round(Single_Customer*100/ len(df_child),2)), Size=12, Color='blue')
Tmess('👉 Households with 1 child represent the majority of the clientele {} %'.format(round(One_child *100/ len(df_child),2)), Size=12, Color='blue')


# ---------------------------------------EXPENSES Graphs
Tmess('Total expenses by marital status and household size', Color='firebrick', Align='center', Size=15, Weight='bold')
# Creating the grid
plt.figure(figsize=(15, 4))

# Barplot
plt.subplot(1, 2, 1) 
sns.barplot(data=Df_Marital_exp, x='Marital', y='Count', hue='People_household', palette='icefire')
plt.ylabel('Count')
plt.xticks(rotation=30)
plt.grid(True, linestyle='-', linewidth=0.7, zorder=0)  # Grille en arrière-plan
plt.gca().set_axisbelow(True)


# Cumulative barplot
plt.subplot(1, 2, 2) 
Df_Marital_expenses.plot(kind='bar', stacked=True, colormap='icefire', ax=plt.gca())
plt.ylabel('Count')
plt.xticks(rotation=30)
plt.grid(True, linestyle='-', linewidth=0.7, zorder=0)  # Grille en arrière-plan
plt.gca().set_axisbelow(True)

plt.tight_layout()
plt.show()

# Comments with calculated percentages
Total = Df_Marital_exp['Count'].sum()
sing = df_child.loc[(df_child['Adult_household'] == 1) & (df_child['Children'] == 0), 'MntTotal'].sum()
coup = df_child.loc[(df_child['Adult_household'] == 2) & (df_child['Children'] == 0), 'MntTotal'].sum()
Child1 = df_child.loc[(df_child['Adult_household'] == 1) & (df_child['Children'] == 1) , 'MntTotal'].sum()
Child2 = df_child.loc[(df_child['Adult_household'] == 2) & (df_child['Children'] == 1) , 'MntTotal'].sum()
Child3 = df_child.loc[df_child['Children'] > 1 , 'MntTotal'].sum()
tcoup = df_child.loc[df_child['Adult_household'] == 2, 'MntTotal'].sum()

Tmess('Comments', Size=14, Color='firebrick', Weight='bold')
Tmess("👉 Couples (married or not) spent on their own {} € - {} %".format(tcoup, round(tcoup *100/ Total,2)), Size=12, Color='blue')

Tmess("👉 Households without children spent {} €, with {} % for couples and {} % for the others.".format(sing+coup, 
    round(coup *100/ Total,2),round(sing *100/ Total,2)), Size=12, Color='blue')

Tmess("👉 Households with 1 child spent €{}, with {}% for couples and {}% for the others".format(Child1 + Child2, 
    round(Child2 *100/ Total,2), round(Child1 *100/ Total,2)), Size=12, Color='blue')
Total customers by marital status and household size
No description has been provided for this image
Comments
👉 Married, Together represent 64.53 % of the customer base
👉 The different types of single (divorced/widow) represent 35.47 %.
👉 Households with 1 child represent the majority of the clientele 50.41 %
Total expenses by marital status and household size
No description has been provided for this image
Comments
👉 Couples (married or not) spent on their own 846880 € - 63.37 %
👉 Households without children spent 694302 €, with 31.08 % for couples and 20.88 % for the others.
👉 Households with 1 child spent €526689, with 26.42% for couples and 12.99% for the others
👉 Household & children
In [137]:
# Counting the number of households by number of children
df_counts = df_child['Children'].value_counts(normalize=True).reset_index() 
df_counts.columns = ['Children', 'Percentage']
df_counts['Percentage'] = round(df_counts['Percentage'] * 100, 2) 
df_counts = df_counts.sort_values(by='Children').reset_index() 
df_counts.drop('index', axis=1, inplace=True)

# Visualization
Tmess('Distribution of households by number of children (%)', Color='firebrick', Align='center', Size=15, Weight='bold')
plt.figure(figsize=(8, 3))
sns.barplot(x='Children', y='Percentage', data=df_counts, palette='icefire')
plt.xlabel('Number of children.', fontsize=12)
plt.ylabel('Pourcentage (%)', fontsize=12)
plt.ylim(0, 60)  # Ajuster la limite de l'axe Y à 100 pour les pourcentages

# Display the %.
for i, row in df_counts.iterrows():
    plt.text(i, row['Percentage'] + 1, f"{row['Percentage']}%", ha='center', color='firebrick', fontweight='bold')
    
plt.show()
Distribution of households by number of children (%)
No description has been provided for this image
In [138]:
# ----------------------------------------- Spending Distribution by Household Composition

plt.figure(figsize=(15, 4))
Tmess('Distribution of expenses based on household size according to marital status.', Color='firebrick', Align='center', Size=15, Weight='bold')

# Subplot: Expenses for single people with children
plt.subplot(1, 2, 1)
sns.boxplot(x='SingleChildren', y='MntTotal', data=df_child, palette='icefire')
plt.title('Single Person Expenses')
plt.xlabel('Number of Children')

# Subplot: Expenses for couples with children
plt.subplot(1, 2, 2)
sns.boxplot(x='CoupleChildren', y='MntTotal', data=df_child, palette='icefire')
plt.title('Couple Expenses')
plt.xlabel('Number of Children')

plt.tight_layout()
plt.show()

# Comments
Tmess('Comments', Size=14, Color='firebrick', Weight='bold')
Tmess("👉 From two children, there is a greater financial ease for the 'Married' category", Size=12, Color='blue')

# ----------------------------------------- Spending Distribution by Household Size
plt.figure(figsize=(15, 4))
Tmess('Spending Distribution based on household size', Color='firebrick', Align='center', Size=15, Weight='bold')

# List of variables for subplots
variables = [
    ('Adult_household', 'Number of Adults', 'Spending Distribution by Number of Adults without child'),
    ('Children', 'Number of Children', 'Spending Distribution by Number of Children'),
    ('People_household', 'Number of People', 'Spending Distribution by Total Number of Persons')]

# Automatic generation of subplots
for i, (col, xlabel, title) in enumerate(variables, start=1):
    plt.subplot(1, 3, i)
    sns.boxplot(x=col, y='MntTotal', data=df if col == 'People_household' else df_child, palette='icefire')
    plt.title(title)
    plt.xlabel(xlabel)

plt.tight_layout()
plt.show()

# Comments
Tmess('Comments', Size=14, Color='firebrick', Weight='bold')
Tmess("👉 Households without children or with a limited number of adults tend to have higher average spending.", Size=12, Color='blue')
Tmess("👉 Budget constraints clearly increase with household size (adults + children).", Size=12, Color='blue')
Tmess("🎯 These observations can guide targeted marketing strategies based on household composition. For instance, child-free households might be more receptive to premium offers, while larger families might prefer budget-friendly options.", Size=12, Color='blue')

# List of variables for subplots
variables = [
    ('Adult_household', 'Number of Adults', 'Purchase Distribution by Number of Adults'),
    ('Children', 'Number of Children', 'Purchase Distribution by Number of Children'),
    ('People_household', 'Number of People', 'Purchase Distribution by Total Number of Persons')]
Tmess('Purchase Distribution based on household size', Color='firebrick', Align='center', Size=15, Weight='bold')
plt.figure(figsize=(15, 4))
# Automatic generation of subplots
for i, (col, xlabel, title) in enumerate(variables, start=1):
    plt.subplot(1, 3, i)
    sns.boxplot(x=col, y='Total_Purchases', data=df if col == 'People_household' else df_child, palette='icefire')
    plt.title(title)
    plt.xlabel(xlabel)

plt.tight_layout()
plt.show()
Distribution of expenses based on household size according to marital status.
No description has been provided for this image
Comments
👉 From two children, there is a greater financial ease for the 'Married' category
Spending Distribution based on household size
No description has been provided for this image
Comments
👉 Households without children or with a limited number of adults tend to have higher average spending.
👉 Budget constraints clearly increase with household size (adults + children).
🎯 These observations can guide targeted marketing strategies based on household composition. For instance, child-free households might be more receptive to premium offers, while larger families might prefer budget-friendly options.
Purchase Distribution based on household size
No description has been provided for this image
👉 Expenses Distribution by Household Factors
In [140]:
# Lists of variables and columns
columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
variables = [('People_household', 'Number of People', 'Expenses Distribution by Total Number of Persons'), 
             ('Adult_household', 'Number of Adults', 'Expenses Distribution by Number of Adults'), 
             ('Children', 'Number of Children', 'Expenses Distribution by Number of Children')]

# Visualization
plt.figure(figsize=(18, 18))  

# Automatic generation of subplots 
total_plots = len(columns) * len(variables) 

for i, col in enumerate(columns):
    for j, (x_col, xlabel, title) in enumerate(variables):
        plt_index = i * len(variables) + j + 1  # Subplot position
        plt.subplot(len(columns), len(variables), plt_index)  
        sns.boxplot(x=x_col, y=col, data=df_child, palette='icefire') 
        plt.title(title if i == 0 or i == 3 else "", fontsize=16, color='firebrick') 
        plt.xlabel(xlabel) #  plt.xlabel(xlabel if i == len(columns) - 1 else "")  
        plt.ylabel(col if j == 0 else "", size = 15)  

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
No description has been provided for this image
👉 Purchase Distribution by Household Factors
In [142]:
# Lists of variables and columns
columns = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
variables = [('People_household', 'Number of People', 'Purchase Distribution by Total Number of Persons'), 
             ('Adult_household', 'Number of Adults', 'Purchase Distribution by Number of Adults'), 
             ('Children', 'Number of Children', 'Purchase Distribution by Number of Children')]

# Visualization
plt.figure(figsize=(18, 18))

# Automatic generation of subplots 
total_plots = len(columns) * len(variables)  

for i, col in enumerate(columns):
    for j, (x_col, xlabel, title) in enumerate(variables):
        plt_index = i * len(variables) + j + 1  
        plt.subplot(len(columns), len(variables), plt_index) 
        sns.boxplot(x=x_col, y=col, data=df_child, palette='icefire')
        plt.title(title if i == 0 or i == 3 else "", fontsize=16, color='firebrick') 
        plt.xlabel(xlabel if i == len(columns) - 1 else "")  
        plt.ylabel(col if j == 0 else "", size = 15) 

plt.tight_layout(rect=[0, 0.03, 1, 0.95]) 
plt.show()
No description has been provided for this image
Correlation of variables

📘 Explanation of Correlation Method Choice¶

We observed that some variables have asymmetric distributions.
Additionally, there are several binary or ordinal variables in the dataset.

➡️ Pearson is not suitable in this context.
➡️ Spearman or Kendall are better suited for our DataFrame.

💡 I chose Kendall because it is more robust and our DataFrame is relatively small.

In [145]:
# Correlation Matrix Calculation with kendall method
matrix = df.select_dtypes(include=np.number).corr(method = 'kendall')
# plot clustered heatmap of correlations
sns.clustermap(matrix, cbar_pos=(-0.05, 0.15, 0.03, 0.7), cmap='coolwarm', center=0, figsize=(12, 8))

# Creating a mask to show only the top triangle
mask = np.triu(np.ones_like(matrix, dtype=bool))

# Heatmap display with a mask to see only the top triangle
message = "Correlation Matrix with Kendall"
Tmess(message, Align='center', Color='Firebrick', Size='16', Weight='Bold')
plt.figure(figsize=(20, 10)) 
sns.heatmap(matrix, mask=mask, annot=True, cmap='coolwarm', fmt=".2f", cbar_kws={"shrink": .8})
plt.show()
Correlation Matrix with Kendall
No description has been provided for this image
No description has been provided for this image
Customers segmentation
Data standardization
In [148]:
# Select relevant variables for clustering
df_clust = df_child.drop('Marital',axis=1)
features_for_clustering = ['Income', 
                           'MntTotal', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts','MntGoldProds', 
                           'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'Total_Purchases',
                           'Education_level', 'Adult_household', 'People_household']
In [150]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
# Reducing the dataset
#df_sample = df_encod.sample(n=10000, random_state=42)
X = df_clust[features_for_clustering]

# Data standardization

scaler = MinMaxScaler()
# Transformation logarithmique (log(1 + x))
#scaler = FunctionTransformer(np.log1p, validate=True)  # Utilisation de log(1 + x) pour gérer les zéros
from sklearn.preprocessing import PowerTransformer
#scaler = PowerTransformer(method='yeo-johnson')  # Ou 'box-cox'

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
Elbow Method & Silhouette Score
In [160]:
# Calculation of the elbow method and the silhouette score
inertias = []
silhouette_scores = []
k_values = range(2, 12)  

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

#  Visualize the curve of the elbow / Silhouette score
plt.figure(figsize=(10, 3))

plt.subplot(1, 2, 1)
plt.plot(k_values, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method', color='firebrick')

plt.subplot(1, 2, 2)
plt.plot(k_values, silhouette_scores, 'rx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Silhouette score', color='firebrick')

plt.tight_layout()
plt.show()

# Displaying high scores
optimal_k_inertia = k_values[inertias.index(min(inertias))]
optimal_k_silhouette = k_values[silhouette_scores.index(max(silhouette_scores))]

Tmess("Optimal number of clusters according to these curves: {}".format(optimal_k_silhouette), Color="blue", Size = 12)
No description has been provided for this image
Optimal number of clusters according to these curves: 2
In order to get more personalized culsters, I'm going to take k = 8
K-mean / Visualization with PCA & t-NSE
In [170]:
k=8
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
df_clust['Cluster'] = clusters
    
Tmess(f"\nCluster analysis with k={k}", Color="blue", Size = 12)
df_clust.groupby('Cluster').mean()
Cluster analysis with k=8
Out[170]:
Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response Age Customer_Days AcceptedCmpOverall MntTotal MntRegularProds Education_level Adult_household People_household Total_Purchases Children SingleChildren CoupleChildren
Cluster
0 81070.678322 0.013986 0.034965 47.678322 793.069930 89.797203 714.132867 110.090909 81.615385 76.594406 1.000000 5.048951 6.678322 8.174825 2.615385 0.118881 0.146853 0.398601 0.314685 0.013986 0.000000 0.454545 50.447552 406.237762 0.993007 1865.300699 1788.706294 3.580420 1.657343 1.706294 20.902098 0.048951 0.013986 0.034965
1 34554.350000 0.772222 0.466667 49.097222 41.811111 5.163889 23.305556 6.725000 5.050000 16.083333 2.152778 2.127778 0.580556 3.183333 6.438889 0.080556 0.008333 0.000000 0.002778 0.000000 0.011111 0.130556 48.680556 331.922222 0.091667 98.138889 82.055556 3.352778 1.000000 2.238889 8.044444 1.238889 1.238889 0.000000
2 75207.589189 0.016216 0.113514 53.767568 527.232432 38.194595 444.459459 98.956757 51.610811 77.783784 1.216216 4.329730 6.064865 8.113514 2.464865 0.048649 0.070270 0.151351 0.183784 0.027027 0.005405 0.286486 54.000000 357.367568 0.481081 1238.237838 1160.454054 3.400000 1.378378 1.508108 19.724324 0.129730 0.113514 0.016216
3 70806.629213 0.095506 0.477528 48.578652 433.056180 103.500000 281.775281 120.764045 101.825843 97.775281 2.117978 6.140449 4.893258 8.786517 3.657303 0.056180 0.050562 0.117978 0.101124 0.011236 0.016854 0.089888 50.213483 379.028090 0.337079 1138.696629 1040.921348 2.971910 1.724719 2.297753 21.938202 0.573034 0.162921 0.410112
4 68811.727273 0.142857 0.718615 48.173160 780.515152 27.506494 239.047619 39.437229 30.917749 59.255411 2.909091 6.645022 5.025974 9.415584 4.896104 0.112554 0.225108 0.194805 0.129870 0.064935 0.004329 0.212121 54.852814 408.653680 0.727273 1176.679654 1117.424242 4.004329 1.961039 2.822511 23.995671 0.861472 0.038961 0.822511
5 57645.307292 0.182292 0.760417 45.416667 461.791667 23.916667 138.520833 29.166667 21.515625 61.640625 3.218750 6.671875 3.109375 7.760417 5.781250 0.078125 0.166667 0.041667 0.026042 0.020833 0.005208 0.182292 54.380208 416.854167 0.333333 736.552083 674.911458 3.723958 1.005208 1.947917 20.760417 0.942708 0.942708 0.000000
6 52553.410256 0.400641 0.807692 49.483974 271.307692 14.762821 87.108974 22.426282 15.096154 48.916667 3.673077 5.525641 2.028846 6.224359 5.759615 0.051282 0.086538 0.006410 0.028846 0.000000 0.009615 0.089744 53.448718 375.750000 0.173077 459.618590 410.701923 3.621795 1.974359 3.182692 17.451923 1.208333 0.064103 1.144231
7 33121.763727 0.798669 0.455907 49.206323 30.605657 4.118136 18.136439 5.951747 4.445923 13.013311 1.898502 1.863561 0.427621 3.084859 6.492512 0.068220 0.011647 0.000000 0.000000 0.003328 0.011647 0.066556 48.366057 326.033278 0.083195 76.271215 63.257903 3.276206 2.000000 3.254576 7.274542 1.254576 0.000000 1.254576
In [172]:
pca = PCA().fit(X_scaled)  # Pas besoin de fixer n_components au début
explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
Tmess("Choice of the number of components", Align='center', Color='Firebrick', Size='16', Weight='Bold')
plt.figure(figsize=(10, 3))
plt.plot(explained_variance_ratio)
plt.axhline(y=0.90, color='red', linestyle='--', label='90% Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()
Choice of the number of components
No description has been provided for this image
In [174]:
cmap = 'icefire'

# Réduction de dimensionnalité avec PCA
pca = PCA(n_components=7)
X_pca = pca.fit_transform(X_scaled)

# Réduction de dimensionnalité avec t-SNE
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Calculer les centroïdes des clusters
centroids_pca = np.array([X_pca[clusters == i].mean(axis=0) for i in np.unique(clusters)])
centroids_tsne = np.array([X_tsne[clusters == i].mean(axis=0) for i in np.unique(clusters)])

# Créer la subplot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Plot PCA
scatter1 = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap=cmap, s=10)
ax1.scatter(centroids_pca[:, 0], centroids_pca[:, 1], c='red', marker='x', s=150, linewidths=5, label='Centroids')
ax1.set_xlabel('PCA Component 1')
ax1.set_ylabel('PCA Component 2')
ax1.set_title('Clusters visualized in 2D with PCA', color='firebrick', size=20)
ax1.legend(*scatter1.legend_elements(), title="Clusters", 
          bbox_to_anchor=(1.05, 1), loc='upper left')



# Plot t-SNE
scatter2 = ax2.scatter(X_tsne[:, 0], X_tsne[:, 1], c=clusters, cmap=cmap, s=10)
ax2.scatter(centroids_tsne[:, 0], centroids_tsne[:, 1], c='red', marker='x', s=150, linewidths=5, label='Centroids')
ax2.set_xlabel('t-SNE Component 1')
ax2.set_ylabel('t-SNE Component 2')
ax2.set_title('Clusters visualized in 2D with t-SNE', color='firebrick', size=20)
ax2.legend()

plt.tight_layout()
plt.show()
No description has been provided for this image
Average expenses by product & Accepted Campaigns by Cluster
In [177]:
# Selection des colonnes
montant = ['Cluster', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']

# Calcul des moyennes
df_clust_mean = df_clust.groupby('Cluster').mean().reset_index()

# Adaptation du df pour visualisation
melted_df = pd.melt(df_clust_mean[montant], id_vars="Cluster", var_name="Product", value_name="Consumption")

# Visualisation 
Tmess("Average expenses by product", Align='center', Color='Firebrick', Size='16', Weight='Bold')
plt.figure(figsize=(12, 4))
sns.barplot(x="Cluster", y="Consumption", hue="Product", data=melted_df, ci=None, palette="icefire")
plt.xlabel("Cluster")
plt.ylabel("Average expenses") 
plt.legend(title="Product", loc="lower left", bbox_to_anchor=(1, 0.5))
plt.show()
Average expenses by product
No description has been provided for this image
In [179]:
# Selection des colonnes
Campaign = ['Cluster', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']

# Adaptation du df pour visualisation
melted_df = pd.melt(df_clust_mean[Campaign], id_vars="Cluster", var_name="Product", value_name="Consumption")

# Visualisation 
Tmess("Average of Accepted Campaigns", Align='center', Color='Firebrick', Size='16', Weight='Bold')
plt.figure(figsize=(12, 4))
sns.barplot(x="Cluster", y="Consumption", hue="Product", data=melted_df, ci=None, palette="icefire")
plt.xlabel("Cluster")
plt.ylabel("Average Campaign Acceptance") 
plt.legend(title="Product", loc="lower left", bbox_to_anchor=(1, 0.5))
plt.show()
Average of Accepted Campaigns
No description has been provided for this image
In [180]:
# Analyze cluster characteristics
cluster_df = pd.DataFrame(X_scaled, columns=features_for_clustering)
cluster_df['Cluster'] = clusters

# Cluster Averages
cluster_means = cluster_df.groupby('Cluster').mean()
cluster_means
Out[180]:
Income MntTotal MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumWebPurchases NumCatalogPurchases NumStorePurchases Total_Purchases Education_level Adult_household People_household
Cluster
0 1.423162 2.093221 1.441625 1.592286 2.551984 1.318270 1.323631 0.627785 0.344740 1.471926 0.724109 0.793123 0.121286 0.025125 -0.980486
1 -0.829838 -0.846174 -0.784593 -0.534634 -0.657052 -0.566854 -0.537587 -0.541845 -0.723535 -0.748579 -0.818319 -0.902717 -0.105496 -1.348874 -0.392530
2 1.139185 1.050201 0.653865 0.295460 1.299295 1.115212 0.594254 0.650774 0.081721 1.248536 0.705163 0.637782 -0.058452 -0.557975 -1.199272
3 0.926027 0.884630 0.374791 1.936651 0.543593 1.512920 1.814925 1.037193 0.743902 0.821895 0.913128 0.929778 -0.484924 0.165958 -0.327548
4 0.829404 0.947808 1.404422 0.026858 0.345114 0.029731 0.091229 0.292636 0.928425 0.870223 1.107517 1.201145 0.543593 0.659921 0.251756
5 0.288563 0.215726 0.459943 -0.063358 -0.121854 -0.157577 -0.137326 0.338740 0.938245 0.172292 0.596052 0.774436 0.264282 -1.337988 -0.713748
6 0.041939 -0.244909 -0.104521 -0.293404 -0.360672 -0.280504 -0.293376 0.092797 0.519066 -0.221183 0.121393 0.338067 0.162504 0.687763 0.649376
7 -0.899225 -0.882547 -0.817798 -0.560915 -0.681064 -0.580956 -0.552272 -0.601186 -0.820160 -0.804270 -0.848748 -1.004262 -0.181778 0.741359 0.728732
Household Composition
Global View
In [185]:
# Main function to display boxplots by cluster
def plot_cluster_boxplots(df, columns, cluster_col='Cluster'):
    """
    Create boxplots for each variable by cluster.
    """
    n_cols = 3  # Nombre de boxplots par ligne
    n_rows = (len(columns) // n_cols) + (len(columns) % n_cols > 0)
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 4))
    axes = axes.flatten()
    
    for i, col in enumerate(columns):
        ax = axes[i]
        sns.boxplot(x=cluster_col, y=col, data=df, ax=ax, palette="icefire")
        ax.set_title(f'Distribution de {col} par Cluster', fontsize=14, fontweight='bold')
        ax.set_xlabel('Cluster', fontsize=12)
        ax.set_ylabel(col, fontsize=12)
        
    # Remove empty axes
    for ax in axes[len(columns):]:
        ax.set_visible(False)
    
    plt.tight_layout()
    plt.show()

# List of columns to display
columns = ['Income', 'Kidhome', 'Teenhome', 'Education_level', 'Adult_household', 'People_household', 'Age', 'SingleChildren', 'CoupleChildren']

# Displaying boxplots for all variables
plot_cluster_boxplots(df_clust, columns)
No description has been provided for this image
Cluster detail view
Analyse des dépenses suivant les caractéristiques de chaque cluster
In [189]:
import plotly.express as px
import plotly.graph_objects as go

'''
['aggrnyl', 'agsunset', 'algae', 'amp', 'armyrose', 'balance',
             'blackbody', 'bluered', 'blues', 'blugrn', 'bluyl', 'brbg',
             'brwnyl', 'bugn', 'bupu', 'burg', 'burgyl', 'cividis', 'curl',
             'darkmint', 'deep', 'delta', 'dense', 'earth', 'edge', 'electric',
             'emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens', 'greys',
             'haline', 'hot', 'hsv', 'ice', 'icefire', 'inferno', 'jet',
             'magenta', 'magma', 'matter', 'mint', 'mrybm', 'mygbm', 'oranges',
             'orrd', 'oryel', 'oxy', 'peach', 'phase', 'picnic', 'pinkyl',
             'piyg', 'plasma', 'plotly3', 'portland', 'prgn', 'pubu', 'pubugn',
             'puor', 'purd', 'purp', 'purples', 'purpor', 'rainbow', 'rdbu',
             'rdgy', 'rdpu', 'rdylbu', 'rdylgn', 'redor', 'reds', 'solar',
             'spectral', 'speed', 'sunset', 'sunsetdark', 'teal', 'tealgrn',
             'tealrose', 'tempo', 'temps', 'thermal', 'tropic', 'turbid',
             'turbo', 'twilight', 'viridis', 'ylgn', 'ylgnbu', 'ylorbr',
             'ylorrd'].
'''


# Création du graphique interactif
fig = px.scatter(
    df_clust,
    x='Cluster',
    y='MntTotal',
    size='Education_level',
    color='People_household',
    #facet_col='Education_level',
    color_continuous_scale='purples',
    size_max= 10,
    hover_data=['People_household', 'Education_level', 'MntTotal'],
    title='Distribution par Cluster'
)

# Personnalisation du graphique
fig.update_layout(
    title={'text': 'Distribution par Cluster','x': 0.5,'font': {'size': 20, 'color': 'firebrick'}},
    xaxis_title='Cluster',
    yaxis_title='MntTotal',
    legend_title='Education Level',
    width=1100,
    height=500,
    showlegend=True,
    legend={'x': 1.05, 'y': 1},
    plot_bgcolor='white',  # Fond du graphique
    paper_bgcolor='white'   
    
)

# Ajouter une grille
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

# Ajouter le contour noir aux points
fig.update_traces(marker=dict(line=dict(color='black', width=0.8)))

# Afficher le graphique
fig.show()
In [191]:
# Fonction principale pour afficher les boxplots par variable pour chaque cluster
def plot_boxplots_by_cluster(df, columns, cluster_col='Cluster'):
    """
    Crée des boxplots pour chaque variable pour chaque cluster.
    """
    clusters = df[cluster_col].unique()

    for cluster in clusters:
        # Filtrer uniquement les données du cluster courant
        df_cluster = df[df[cluster_col] == cluster]
        
        n_cols = len(columns)
        n_rows = (len(columns) // n_cols) + (len(columns) % n_cols > 0)
        
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 3))
        axes = axes.flatten()

        Tmess(f'Identity card - Cluster {cluster} - contains {df_cluster.shape[0]} individuals', Size=18, Weight='bold', Color='firebrick', Align='center')
        
        for i, col in enumerate(columns):
            ax = axes[i]
            sns.boxplot(y=col, data=df_cluster, ax=ax, palette="terrain")
            
            # Afficher les informations
            ax.set_title(f'{col}', fontsize=14, fontweight='bold')
            ax.set_xlabel('')
            ax.set_ylabel(col, fontsize=12)
            # Ajouter la médiane en rouge et épaisse
            median_value = df_cluster[col].median()
            ax.axhline(y=median_value, color='firebrick', linestyle='-', linewidth=2)
            mean_value = df_cluster[col].mean()
            ax.axhline(y=mean_value, color='teal', linestyle='-', linewidth=2)
            ax.text(x=-0.4, y=median_value, s='med', color='firebrick', fontsize=12, fontweight='bold')
            ax.text(x=0.2, y=mean_value, s='mean', color='teal', fontsize=12, fontweight='bold')

        # Supprimer les axes vides
        for ax in axes[len(columns):]:
            ax.set_visible(False)
        
        plt.tight_layout()
        plt.show()

        # Création du scatter plot si les colonnes nécessaires existent
        required_cols = ['People_household', 'MntTotal', 'Adult_household', 'Education_level']
        if all(col in df.columns for col in required_cols):
            df_HH = df_cluster.copy()
            
            # Ajout d'un petit décalage pour une meilleure visualisation
            df_HH.loc[df_HH['Adult_household'] == 2, 'People_household'] += 0.05
            df_HH.loc[df_HH['Adult_household'] == 1, 'People_household'] -= 0.05
            
            if len(df_HH) > 0:
                plt.figure(figsize=(12, 3))
                scatter = sns.scatterplot(
                    data=df_HH, 
                    x='People_household', 
                    y='MntTotal', 
                    size='Adult_household',
                    hue='Education_level',
                    palette='icefire',
                    sizes=(30, 80)
                )
                
                plt.title('Relation People_household vs MntTotal', fontsize=12, color='firebrick')
                plt.ylabel('MntTotal', fontsize=12)
                plt.xlabel('People_household', fontsize=12)
                plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left')
                plt.grid(axis='x', linestyle='--', alpha=0.7)
                plt.xticks(np.arange(1, 6))
                
                plt.tight_layout()
                plt.show()
                Tmess('\n -------------------------------------------- \n', Align='center')
In [193]:
df_clust = df_clust.sort_values(by='Cluster', ascending=True)
# List of columns to display
columns = ['MntTotal', 'Total_Purchases', 'Income', 'Education_level', 'Age', 'Kidhome', 'Teenhome', 'Adult_household', 'People_household', 'SingleChildren', 'CoupleChildren']
# Displaying boxplots by cluster
plot_boxplots_by_cluster(df_clust, columns)
Identity card - Cluster 0 - contains 143 individuals
No description has been provided for this image
No description has been provided for this image
--------------------------------------------
Identity card - Cluster 1 - contains 360 individuals
No description has been provided for this image
No description has been provided for this image
--------------------------------------------
Identity card - Cluster 2 - contains 185 individuals
No description has been provided for this image
No description has been provided for this image
--------------------------------------------
Identity card - Cluster 3 - contains 178 individuals
No description has been provided for this image
No description has been provided for this image
--------------------------------------------
Identity card - Cluster 4 - contains 231 individuals
No description has been provided for this image
No description has been provided for this image
--------------------------------------------
Identity card - Cluster 5 - contains 192 individuals
No description has been provided for this image
No description has been provided for this image
--------------------------------------------
Identity card - Cluster 6 - contains 312 individuals
No description has been provided for this image
No description has been provided for this image
--------------------------------------------
Identity card - Cluster 7 - contains 601 individuals
No description has been provided for this image
No description has been provided for this image
--------------------------------------------

Cluster Analysis¶

Cluster 0¶

  • Average Spending: €1,850 across approximately 20 purchases.
  • Composition: Mainly couples without children.
  • Income: Significant (average €80k).
  • Education: Level 3+.

Cluster 1¶

  • Average Spending: €100 across 5 to 10 purchases.
  • Composition: Single individuals with 1 or 2 children.
  • Income: Relatively low (average €35k).
  • Education: Level 3+.

Cluster 2¶

  • Average Spending: €1,250 across approximately 20 purchases.
  • Composition: Mainly single individuals without children.
  • Income: High (average €75k).
  • Education: Level 3+.
  • Age: Higher.

Cluster 3¶

  • Average Spending: €1,200 across approximately 20 purchases.
  • Composition: Couples, potentially with 1 teenager.
  • Income: High (average €70k).
  • Education: Level 3.

Cluster 4¶

  • Average Spending: €1,200 across 25 purchases.
  • Composition: Couples with 1 teenager.
  • Income: High (average €70k).
  • Education: Level 4+.
  • Age: Higher.

Cluster 5¶

  • Average Spending: €750 across approximately 20 purchases.
  • Composition: Single individuals with 1 teenager.
  • Income: Good level (average €60k).
  • Education: Level 3++.
  • Age: Higher.

Cluster 6¶

  • Average Spending: €450 across fewer than 20 purchases.
  • Composition: Couples with at least 1 child.
  • Income: Good level (average €55k).
  • Education: Level 3++.

Cluster 7¶

  • Average Spending: Less than €100 across fewer than 10 purchases.
  • Composition: Exclusively couples with at least 1 young child, possibly 1 teenager.
  • Income: Low (average €35k).
  • Education: Level 3+.
Analyse des campagnes acceptées par cluster
In [195]:
columns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
campaign = {} 

# Crée des sous-graphiques (2 lignes, 3 colonnes)
fig, axes = plt.subplots(2, 3, figsize=(18, 8))  # 2 lignes x 3 colonnes, taille adaptée

for i, col in enumerate(columns):
    campaign[col] = df_clust[df_clust[col] == 1]
    campaign_clust = campaign[col].groupby(['Cluster']).size().reset_index(name='count')
    
    row = i // 3 
    col_pos = i % 3  
    sns.barplot(ax=axes[row, col_pos], data=campaign_clust, x='Cluster', y='count', palette='icefire')
    x_pos = 0.7  # Position x au centre de l'axe
    y_pos = axes[row, col_pos].get_ylim()[1] * 0.95 
    axes[row, col_pos].text(x_pos, y_pos, f'Total accepted: {campaign[col].shape[0]}', 
                        fontsize=14, color='black', fontweight='bold')
    axes[row, col_pos].set_title(col, fontsize=18, color='firebrick', fontweight='bold')
    axes[row, col_pos].set_xlabel('Cluster')
    axes[row, col_pos].set_ylabel('Count')

# Ajustement de l'espacement entre les sous-graphiques
plt.tight_layout()
plt.show()
No description has been provided for this image
In [198]:
# Creating dfs for analysis
cluster_dfs = {}
for i in range (k):
    df_cluster = df_clust[df_clust['Cluster']==i]
    cluster_dfs[i] = df_cluster[columns]
In [200]:
for i in range(k):
    
    plt.figure(figsize=(10, 3))
    
    # Filtrer les colonnes dont la somme est différente de 0
    filtered_columns = cluster_dfs[i].loc[:, cluster_dfs[i].sum() != 0].columns
    # Créer le dictionnaire des positions
    x_positions = {col: idx for idx, col in enumerate(filtered_columns)}
    
    for col in cluster_dfs[i].columns:
        # Récupérer les index où la valeur est 1
        indices = cluster_dfs[i][cluster_dfs[i][col] == 1].index
        if len(indices) > 0:  # S'il y a des 1 dans cette colonne
            plt.scatter([col] * len(indices), indices, label=col, alpha=0.8, s=20)
            # Ajouter le nombre de points au-dessus de la colonne, décalé vers la droite
            plt.text(x_positions[col] + 0.1, cluster_dfs[i].index.max() - 200, f'{len(indices)}', color='firebrick',
                    ha='left', va='bottom')
    
    plt.title(f'Cluster {i} - Campagnes acceptées', color='firebrick', size=14)
    plt.xlabel('')
    plt.ylabel('Customers Index')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xticks('')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    
    plt.ylim(cluster_dfs[i].index.min() - 50, cluster_dfs[i].index.max() + 100)
    
    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Distribution des individus par cluster
In [202]:
clustnb = 5

# Création du graphique
plt.figure(figsize=(12, 2.5))
scatter = sns.scatterplot(
    data=df_clust[df_clust['Cluster']==clustnb], 
    x='MntTotal', 
    y='Cluster', 
    size='People_household',  # Taille des bulles
    hue='Education_level',   # Couleur basée sur les exports
    palette='icefire', 
    sizes=(10, 100),  # Taille min et max des bulles
    legend='brief'
)

# Personnalisation
plt.title('Cluster ', fontsize=16, color='firebrick')
plt.xlabel('Cluster', fontsize=12)
plt.ylabel('MntTotal', fontsize=12)
plt.legend(title='People_household', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Afficher le graphique
plt.tight_layout()
plt.show()
No description has been provided for this image
Modèle de prédiction
In [237]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.metrics import classification_report
In [239]:
# Préparation des données
df_camp = df_Prepared.copy()
df_camp['Marital'] = df_camp['Marital'].replace({'Single': 1, 'Married': 2, 'Divorced': 3, 'Together': 4, 'Widow': 5})

# Colonnes à supprimer et à analyser
coldel = ['AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',  'Response', 'AcceptedCmpOverall']
colaccep = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'AcceptedCmp1', 'Response']
Prédiction campagne 1
In [242]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer
# Data standardization

scaler = MinMaxScaler()
# Transformation logarithmique (log(1 + x))
#scaler = FunctionTransformer(np.log1p, validate=True)  # Utilisation de log(1 + x) pour gérer les zéros
#scaler = PowerTransformer(method='yeo-johnson')  # Ou 'box-cox'
#scaler = StandardScaler()
In [244]:
scaled_data = scaler.fit_transform(df_camp)
df_camp = pd.DataFrame(scaled_data, columns=df.columns)
In [246]:
# Filtrer les colonnes numériques
campain1 = df_camp.drop(coldel, axis=1)

# Calcul des corrélations Kendall pour toutes les variables par rapport à 'AcceptedCmp1'
correlations = campain1.corr(method='kendall')['AcceptedCmp1'].sort_values(ascending=False)

# Créer un DataFrame pour visualiser
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']

# Exclure la variable cible elle-même (si nécessaire)
correlation_df = correlation_df[correlation_df['Feature'] != 'AcceptedCmp1']

# Visualiser avec un barplot
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with AcceptedCmp1 (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [248]:
coldel1 = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'Complain', 'Response', 'AcceptedCmp1', 
           'AcceptedCmpOverall', 'Age', 'Adult_household', 'Education_level', 'Customer_Days', 'Recency', 'Marital']
In [250]:
df_camp.columns
Out[250]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
       'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
       'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
      dtype='object')
In [254]:
# Diviser les données
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE pour équilibrer les classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Utilisation de ADASYN à la place de SMOTE
#adasyn = ADASYN(random_state=42)
#X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Calculer le ratio des classes
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [256]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Définir la grille de recherche pour XGBoost
param_grid_xgb = {
    'n_estimators': [200, 220],
    'max_depth': [10, 12],
    'learning_rate': [0.1],
    'scale_pos_weight': [ratio_of_classes],  # Essayer différents poids
    'colsample_bytree': [0.5, 0.7],
    'subsample': [0.6, 0.8]
}

# Définir la grille de recherche pour Random Forest
param_grid_rf = {
    'n_estimators': [200, 220],
    'max_depth': [None],
    'class_weight': ['balanced']
}

# Appliquer GridSearchCV pour XGBoost
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)

# Appliquer GridSearchCV pour RandomForest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)

# Créer un modèle d'ensemble avec les meilleurs modèles
model_Camp1 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp1.fit(X_resampled, y_resampled)

# Prédictions et évaluation
y_pred = model_Camp1.predict(X_test)
print(classification_report(y_test, y_pred))

# Calcul du F1-score et MCC
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'scale_pos_weight': 2.0, 'subsample': 0.8}
Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'n_estimators': 200}
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96       414
         1.0       0.46      0.59      0.52        27

    accuracy                           0.93       441
   macro avg       0.72      0.77      0.74       441
weighted avg       0.94      0.93      0.94       441

F1-score: 0.5161290322580645
MCC: 0.484878795449813
In [258]:
from sklearn.metrics import roc_curve, auc
# Calcul de la courbe ROC et AUC
fpr, tpr, thresholds = roc_curve(y_test, model_Camp1.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)

# Affichage de la courbe ROC
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image
In [260]:
from joblib import dump, load
from sklearn.metrics import roc_curve, auc
# Sauvegarder du modèle
dump(model_Camp1, "model_Camp1.joblib")
# Chargement du modèle
model = load("model_Camp1.joblib")
model
Out[260]:
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, ga...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=200, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, ga...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=200, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
RandomForestClassifier(class_weight='balanced', n_estimators=200,
                       random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
In [261]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
coldel1 = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'Complain', 'Response', 'AcceptedCmp1', 
           'AcceptedCmpOverall', 'Age', 'Adult_household', 'Education_level', 'Customer_Days', 'Recency', 'Marital']

X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp1']

# Effectuer les prédictions
y_pred = model.predict(X)

# Calculer les métriques
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
print(df_camp['AcceptedCmp1'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
auc = roc_auc_score(y_true, model.predict_proba(X)[:, 1])
print(f"ROC-AUC: {auc}")
0.9863760217983651 0.8733333333333333 0.9225352112676056 0.8972602739726028
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      2060
         1.0       0.87      0.92      0.90       142

    accuracy                           0.99      2202
   macro avg       0.93      0.96      0.94      2202
weighted avg       0.99      0.99      0.99      2202

AcceptedCmp1
0.0    0.935513
1.0    0.064487
Name: proportion, dtype: float64
[[2041   19]
 [  11  131]]
ROC-AUC: 0.9942738958019964
In [264]:
# Importance des variables pour Random Forest
rf_importances = best_rf.feature_importances_
rf_features = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_importances
}).sort_values(by='Importance', ascending=False)

# Importance des variables pour XGBoost
xgb_importances = best_xgb.feature_importances_
xgb_features = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_importances
}).sort_values(by='Importance', ascending=False)

# Fusionner les importances
combined_importances = pd.DataFrame({
    'Feature': X.columns,
    'RandomForest_Importance': rf_importances,
    'XGBoost_Importance': xgb_importances
}).set_index('Feature')

# Moyenne des importances pour chaque variable
combined_importances['Average_Importance'] = combined_importances.mean(axis=1)

# Trier par importance moyenne
combined_importances = combined_importances.sort_values(by='Average_Importance', ascending=False)

print("Importances combinées des variables:")
combined_importances
Importances combinées des variables:
Out[264]:
RandomForest_Importance XGBoost_Importance Average_Importance
Feature
Income 0.190842 0.199219 0.195030
MntRegularProds 0.090077 0.202953 0.146515
MntWines 0.088519 0.077272 0.082896
NumCatalogPurchases 0.104712 0.047787 0.076249
NumDealsPurchases 0.035666 0.095532 0.065599
MntTotal 0.080213 0.030274 0.055244
NumStorePurchases 0.059832 0.036734 0.048283
NumWebVisitsMonth 0.042369 0.045146 0.043757
MntMeatProducts 0.059955 0.018466 0.039211
MntFruits 0.036686 0.040942 0.038814
Total_Purchases 0.050242 0.023149 0.036695
MntGoldProds 0.027894 0.026649 0.027271
Teenhome 0.015396 0.039016 0.027206
NumWebPurchases 0.031306 0.021557 0.026431
MntSweetProducts 0.033143 0.018160 0.025651
MntFishProducts 0.027289 0.020559 0.023924
Kidhome 0.008949 0.032364 0.020657
People_household 0.016911 0.024223 0.020567
Prédiction campagne 2
In [268]:
# Colonnes à supprimer et à analyser
coldel = ['AcceptedCmp1', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',  'Response', 'AcceptedCmpOverall']
In [270]:
# Filtrer les colonnes numériques
campain2 = df_camp.drop(coldel, axis=1)

# Calcul des corrélations Kendall pour toutes les variables par rapport à 'AcceptedCmp1'
correlations = campain2.corr(method='kendall')['AcceptedCmp2'].sort_values(ascending=False)

# Créer un DataFrame pour visualiser
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']

# Exclure la variable cible elle-même (si nécessaire)
correlation_df = correlation_df[correlation_df['Feature'] != 'AcceptedCmp2']

# Visualiser avec un barplot
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with AcceptedCmp2 (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [271]:
coldel1 = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'Complain', 'Response', 'AcceptedCmp1', 
           'AcceptedCmpOverall', 'Age', 'MntFruits', 'MntFishProducts', 'MntSweetProducts', 'Recency', 'NumWebVisitsMonth',
           'Adult_household', 'Customer_Days']

coldel1 = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'Complain', 'Response', 'AcceptedCmp1', 
           'AcceptedCmpOverall', 'MntFruits', 'MntFishProducts', 'MntSweetProducts', 'Recency', 'Customer_Days', 'Adult_household']
In [274]:
df_camp.columns
Out[274]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
       'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
       'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
      dtype='object')
In [276]:
# Diviser les données
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp2']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE pour équilibrer les classes
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Utilisation de ADASYN à la place de SMOTE
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Calculer le ratio des classes
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [278]:
# Définir la grille de recherche pour XGBoost
param_grid_xgb = {
    'n_estimators': [220, 250],
    'max_depth': [8, 10],
    'learning_rate': [0.1],
    'scale_pos_weight': [ratio_of_classes],  # Essayer différents poids
    'colsample_bytree': [0.3, 0.5],
    'subsample': [0.4, 0.6]
}

# Définir la grille de recherche pour Random Forest
param_grid_rf = {
    'n_estimators': [180, 200],
    'max_depth': [None],
    'class_weight': ['balanced']
}

# Appliquer GridSearchCV pour XGBoost
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)

# Appliquer GridSearchCV pour RandomForest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)

# Créer un modèle d'ensemble avec les meilleurs modèles
model_Camp2 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp2.fit(X_resampled, y_resampled)

# Prédictions et évaluation
y_pred = model_Camp2.predict(X_test)
print(classification_report(y_test, y_pred))

# Calcul du F1-score et MCC
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.3, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 250, 'scale_pos_weight': 2.0005763688760805, 'subsample': 0.6}
Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'n_estimators': 180}
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       436
         1.0       0.29      0.40      0.33         5

    accuracy                           0.98       441
   macro avg       0.64      0.69      0.66       441
weighted avg       0.99      0.98      0.98       441

F1-score: 0.3333333333333333
MCC: 0.32912530817577573
In [280]:
from sklearn.metrics import auc
# Calcul de la courbe ROC et AUC
fpr, tpr, thresholds = roc_curve(y_test, model_Camp2.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)

# Affichage de la courbe ROC
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image
In [282]:
from joblib import dump, load
from sklearn.metrics import roc_curve, auc
# Sauvegarder du modèle
dump(model_Camp2, "model_Camp2.joblib")
# Chargement du modèle
model2 = load("model_Camp2.joblib")
model2
Out[282]:
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     n_estimators=180,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.3, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, ga...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=250, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     n_estimators=180,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.3, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, ga...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=250, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
RandomForestClassifier(class_weight='balanced', n_estimators=180,
                       random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.3, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=250, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
In [286]:
coldel1 = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'Complain', 'Response', 'AcceptedCmp1', 
           'AcceptedCmpOverall', 'MntFruits', 'MntFishProducts', 'MntSweetProducts', 'Recency', 'Customer_Days', 'Adult_household']

X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp2']

# Effectuer les prédictions
y_pred = model2.predict(X)

# Calculer les métriques
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
print(df_camp['AcceptedCmp2'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
auc = roc_auc_score(y_true, model2.predict_proba(X)[:, 1])
print(f"ROC-AUC: {auc}")
0.9963669391462306 0.84375 0.9 0.8709677419354839
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2172
         1.0       0.84      0.90      0.87        30

    accuracy                           1.00      2202
   macro avg       0.92      0.95      0.93      2202
weighted avg       1.00      1.00      1.00      2202

AcceptedCmp2
0.0    0.986376
1.0    0.013624
Name: proportion, dtype: float64
[[2167    5]
 [   3   27]]
ROC-AUC: 0.9895794966236955
Prédiction campagne 3
In [289]:
# Colonnes à supprimer et à analyser
coldel = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp4', 'AcceptedCmp5', 'Response', 'AcceptedCmpOverall']
In [291]:
# Filtrer les colonnes numériques
campain3 = df_camp.drop(coldel, axis=1)

# Calcul des corrélations Kendall pour toutes les variables par rapport à 'AcceptedCmp1'
correlations = campain3.corr(method='kendall')['AcceptedCmp3'].sort_values(ascending=False)

# Créer un DataFrame pour visualiser
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']

# Exclure la variable cible elle-même (si nécessaire)
correlation_df = correlation_df[correlation_df['Feature'] != 'AcceptedCmp3']

# Visualiser avec un barplot
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with AcceptedCmp3 (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [293]:
coldel1 = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain', 'Response', 'AcceptedCmpOverall']
In [295]:
df_camp.columns
Out[295]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
       'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
       'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
      dtype='object')
In [297]:
# Diviser les données
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp3']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE pour équilibrer les classes
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Utilisation de ADASYN à la place de SMOTE
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Calculer le ratio des classes
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [299]:
# Définir la grille de recherche pour XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'scale_pos_weight': [ratio_of_classes, 1],  # Essayer différents poids
    'colsample_bytree': [0.7, 1.0],
    'subsample': [0.8, 1.0]
}

# Définir la grille de recherche pour Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Appliquer GridSearchCV pour XGBoost
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)

# Appliquer GridSearchCV pour RandomForest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)

# Créer un modèle d'ensemble avec les meilleurs modèles
model_Camp3 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp3.fit(X_resampled, y_resampled)

# Prédictions et évaluation
y_pred = model_Camp3.predict(X_test)
print(classification_report(y_test, y_pred))

# Calcul du F1-score et MCC
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 0.8}
Best parameters: {'class_weight': 'balanced_subsample', 'max_depth': 20, 'n_estimators': 200}
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96       401
         1.0       0.62      0.20      0.30        40

    accuracy                           0.92       441
   macro avg       0.77      0.59      0.63       441
weighted avg       0.90      0.92      0.90       441

F1-score: 0.3018867924528302
MCC: 0.31840662051218027
In [300]:
from sklearn.metrics import auc
# Calcul de la courbe ROC et AUC
fpr, tpr, thresholds = roc_curve(y_test, model_Camp3.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)

# Affichage de la courbe ROC
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image
In [301]:
from joblib import dump, load
from sklearn.metrics import roc_curve, auc
# Sauvegarder du modèle
dump(model_Camp3, "model_Camp3.joblib")
# Chargement du modèle
model3 = load("model_Camp3.joblib")
model3
Out[301]:
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced_subsample',
                                                     max_depth=20,
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=Non...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=200, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced_subsample',
                                                     max_depth=20,
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=Non...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=200, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
RandomForestClassifier(class_weight='balanced_subsample', max_depth=20,
                       n_estimators=200, random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
In [302]:
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp3']

# Effectuer les prédictions
y_pred = model3.predict(X)

# Calculer les métriques
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
print(df_camp['AcceptedCmp3'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
auc = roc_auc_score(y_true, model3.predict_proba(X)[:, 1])
print(f"ROC-AUC: {auc}")
0.983197093551317 0.9632352941176471 0.803680981595092 0.8762541806020067
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      2039
         1.0       0.96      0.80      0.88       163

    accuracy                           0.98      2202
   macro avg       0.97      0.90      0.93      2202
weighted avg       0.98      0.98      0.98      2202

AcceptedCmp3
0.0    0.925976
1.0    0.074024
Name: proportion, dtype: float64
[[2034    5]
 [  32  131]]
ROC-AUC: 0.976829132529178
Prédiction campagne 4
In [308]:
# Colonnes à supprimer et à analyser
coldel = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp5',  'Response', 'AcceptedCmpOverall']
In [310]:
# Filtrer les colonnes numériques
campain4 = df_camp.drop(coldel, axis=1)

# Calcul des corrélations Kendall pour toutes les variables par rapport à 'AcceptedCmp1'
correlations = campain4.corr(method='kendall')['AcceptedCmp4'].sort_values(ascending=False)

# Créer un DataFrame pour visualiser
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']

# Exclure la variable cible elle-même (si nécessaire)
correlation_df = correlation_df[correlation_df['Feature'] != 'AcceptedCmp4']

# Visualiser avec un barplot
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with AcceptedCmp4 (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [311]:
coldel1 = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain', 'Response','AcceptedCmpOverall', 
          'MntFishProducts', 'MntSweetProducts', 'NumDealsPurchases']
In [314]:
df_camp.columns
Out[314]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
       'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
       'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
      dtype='object')
In [316]:
# Diviser les données
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp4']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE pour équilibrer les classes
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Utilisation de ADASYN à la place de SMOTE
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Calculer le ratio des classes
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [318]:
# Définir la grille de recherche pour XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'scale_pos_weight': [ratio_of_classes, 1],  # Essayer différents poids
    'colsample_bytree': [0.7, 1.0],
    'subsample': [0.8, 1.0]
}

# Définir la grille de recherche pour Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Appliquer GridSearchCV pour XGBoost
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)

# Appliquer GridSearchCV pour RandomForest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)

# Créer un modèle d'ensemble avec les meilleurs modèles
model_Camp4 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp4.fit(X_resampled, y_resampled)

# Prédictions et évaluation
y_pred = model_Camp4.predict(X_test)
print(classification_report(y_test, y_pred))

# Calcul du F1-score et MCC
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'scale_pos_weight': 1.9969306322897482, 'subsample': 1.0}
Best parameters: {'class_weight': 'balanced_subsample', 'max_depth': 20, 'n_estimators': 100}
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96       414
         1.0       0.41      0.44      0.43        27

    accuracy                           0.93       441
   macro avg       0.69      0.70      0.69       441
weighted avg       0.93      0.93      0.93       441

F1-score: 0.42857142857142855
MCC: 0.39016703715254414
In [320]:
from sklearn.metrics import auc
# Calcul de la courbe ROC et AUC
fpr, tpr, thresholds = roc_curve(y_test, model_Camp4.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)

# Affichage de la courbe ROC
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image
In [321]:
from joblib import dump, load
from sklearn.metrics import roc_curve, auc
# Sauvegarder du modèle
dump(model_Camp4, "model_Camp4.joblib")
# Chargement du modèle
model4 = load("model_Camp4.joblib")
model4
Out[321]:
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced_subsample',
                                                     max_depth=20,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=N...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced_subsample',
                                                     max_depth=20,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=N...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
RandomForestClassifier(class_weight='balanced_subsample', max_depth=20,
                       random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
In [322]:
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp4']

# Effectuer les prédictions
y_pred = model4.predict(X)

# Calculer les métriques
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
print(df_camp['AcceptedCmp4'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
auc = roc_auc_score(y_true, model4.predict_proba(X)[:, 1])
print(f"ROC-AUC: {auc}")
0.9854677565849228 0.8975903614457831 0.9085365853658537 0.9030303030303031
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      2038
         1.0       0.90      0.91      0.90       164

    accuracy                           0.99      2202
   macro avg       0.95      0.95      0.95      2202
weighted avg       0.99      0.99      0.99      2202

AcceptedCmp4
0.0    0.925522
1.0    0.074478
Name: proportion, dtype: float64
[[2021   17]
 [  15  149]]
ROC-AUC: 0.9892439981809042
Prédiction campagne 5
In [328]:
# Colonnes à supprimer et à analyser
coldel = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'Response', 'AcceptedCmpOverall']
In [330]:
# Filtrer les colonnes numériques
campain5 = df_camp.drop(coldel, axis=1)

# Calcul des corrélations Kendall pour toutes les variables par rapport à 'AcceptedCmp1'
correlations = campain5.corr(method='kendall')['AcceptedCmp5'].sort_values(ascending=False)

# Créer un DataFrame pour visualiser
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']

# Exclure la variable cible elle-même (si nécessaire)
correlation_df = correlation_df[correlation_df['Feature'] != 'AcceptedCmp5']

# Visualiser avec un barplot
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with AcceptedCmp5 (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [331]:
coldel1 = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain', 'Response', 'AcceptedCmpOverall', 
          'Customer_Days', 'Recency', 'Marital', 'Age']
df_camp.columns
Out[331]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
       'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
       'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
      dtype='object')
In [334]:
# Diviser les données
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp5']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE pour équilibrer les classes
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Utilisation de ADASYN à la place de SMOTE
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Calculer le ratio des classes
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [336]:
# Définir la grille de recherche pour XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'scale_pos_weight': [ratio_of_classes, 1],  # Essayer différents poids
    'colsample_bytree': [0.7, 1.0],
    'subsample': [0.8, 1.0]
}

# Définir la grille de recherche pour Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Appliquer GridSearchCV pour XGBoost
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)

# Appliquer GridSearchCV pour RandomForest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)

# Créer un modèle d'ensemble avec les meilleurs modèles
model_Camp5 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp5.fit(X_resampled, y_resampled)

# Prédictions et évaluation
y_pred = model_Camp5.predict(X_test)
print(classification_report(y_test, y_pred))

# Calcul du F1-score et MCC
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 0.8}
Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'n_estimators': 200}
              precision    recall  f1-score   support

         0.0       0.98      0.95      0.97       415
         1.0       0.47      0.69      0.56        26

    accuracy                           0.94       441
   macro avg       0.73      0.82      0.76       441
weighted avg       0.95      0.94      0.94       441

F1-score: 0.5625
MCC: 0.5406668729694817
In [338]:
from sklearn.metrics import auc
# Calcul de la courbe ROC et AUC
fpr, tpr, thresholds = roc_curve(y_test, model_Camp5.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)

# Affichage de la courbe ROC
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image
In [339]:
from joblib import dump, load
from sklearn.metrics import roc_curve, auc
# Sauvegarder du modèle
dump(model_Camp5, "model_Camp5.joblib")
# Chargement du modèle
model5 = load("model_Camp5.joblib")
model5
Out[339]:
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, ga...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=6,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=200, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, ga...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=6,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=200, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
RandomForestClassifier(class_weight='balanced', n_estimators=200,
                       random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
In [340]:
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp5']

# Effectuer les prédictions
y_pred = model5.predict(X)

# Calculer les métriques
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
print(df_camp['AcceptedCmp5'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
auc = roc_auc_score(y_true, model5.predict_proba(X)[:, 1])
print(f"ROC-AUC: {auc}")
0.9872842870118075 0.884393063583815 0.9503105590062112 0.9161676646706587
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99      2041
         1.0       0.88      0.95      0.92       161

    accuracy                           0.99      2202
   macro avg       0.94      0.97      0.95      2202
weighted avg       0.99      0.99      0.99      2202

AcceptedCmp5
0.0    0.926885
1.0    0.073115
Name: proportion, dtype: float64
[[2021   20]
 [   8  153]]
ROC-AUC: 0.9972824184953789
Prédiction campagne 6
In [346]:
# Colonnes à supprimer et à analyser
coldel = ['AcceptedCmp1', 'AcceptedCmp2','AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmpOverall']
In [348]:
# Filtrer les colonnes numériques
campain6 = df_camp.drop(coldel, axis=1)

# Calcul des corrélations Kendall pour toutes les variables par rapport à 'AcceptedCmp1'
correlations = campain6.corr(method='kendall')['Response'].sort_values(ascending=False)

# Créer un DataFrame pour visualiser
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']

# Exclure la variable cible elle-même (si nécessaire)
correlation_df = correlation_df[correlation_df['Feature'] != 'Response']

# Visualiser avec un barplot
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with Response (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [349]:
coldel1 = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain','AcceptedCmpOverall', 
           'Response', 'NumWebVisitsMonth', 'Age']
df_camp.columns
Out[349]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
       'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
       'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
      dtype='object')
In [354]:
# Diviser les données
X = df_camp.drop(coldel1, axis=1)
y = df_camp['Response']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE pour équilibrer les classes
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Utilisation de ADASYN à la place de SMOTE
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Calculer le ratio des classes
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [356]:
# Définir la grille de recherche pour XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'scale_pos_weight': [ratio_of_classes, 1],  # Essayer différents poids
    'colsample_bytree': [0.7, 1.0],
    'subsample': [0.8, 1.0]
}

# Définir la grille de recherche pour Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Appliquer GridSearchCV pour XGBoost
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)

# Appliquer GridSearchCV pour RandomForest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)

# Créer un modèle d'ensemble avec les meilleurs modèles
model_Camp6 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp6.fit(X_resampled, y_resampled)

# Prédictions et évaluation
y_pred = model_Camp6.predict(X_test)
print(classification_report(y_test, y_pred))

# Calcul du F1-score et MCC
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'scale_pos_weight': 1.9829172141918527, 'subsample': 0.8}
Best parameters: {'class_weight': 'balanced_subsample', 'max_depth': None, 'n_estimators': 200}
              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92       373
         1.0       0.55      0.56      0.55        68

    accuracy                           0.86       441
   macro avg       0.74      0.74      0.74       441
weighted avg       0.86      0.86      0.86       441

F1-score: 0.5547445255474452
MCC: 0.4728881523425176
In [358]:
from sklearn.metrics import auc
# Calcul de la courbe ROC et AUC
fpr, tpr, thresholds = roc_curve(y_test, model_Camp6.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)

# Affichage de la courbe ROC
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image
In [360]:
from joblib import dump, load
from sklearn.metrics import roc_curve, auc
# Sauvegarder du modèle
dump(model_Camp6, "model_Camp6.joblib")
# Chargement du modèle
model6 = load("model_Camp6.joblib")
model6
Out[360]:
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced_subsample',
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_typ...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced_subsample',
                                                     n_estimators=200,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_typ...
                                            grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=0.1, max_bin=None,
                                            max_cat_threshold=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=10,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            multi_strategy=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=42, ...))],
                 voting='soft')
RandomForestClassifier(class_weight='balanced_subsample', n_estimators=200,
                       random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
In [362]:
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['Response']

# Effectuer les prédictions
y_pred = model6.predict(X)

# Calculer les métriques
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
print(df_camp['Response'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
auc = roc_auc_score(y_true, model6.predict_proba(X)[:, 1])
print(f"ROC-AUC: {auc}")
0.9677565849227975 0.8808139534883721 0.9099099099099099 0.8951255539143279
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      1869
         1.0       0.88      0.91      0.90       333

    accuracy                           0.97      2202
   macro avg       0.93      0.94      0.94      2202
weighted avg       0.97      0.97      0.97      2202

Response
0.0    0.848774
1.0    0.151226
Name: proportion, dtype: float64
[[1828   41]
 [  30  303]]
ROC-AUC: 0.9874834706295381
Model backup
In [ ]:
from joblib import dump

# Save each model with a clear and explicit name.
dump(model_Camp1, "model_Camp1.joblib")
dump(model_Camp2, "model_Camp2.joblib")
dump(model_Camp3, "model_Camp3.joblib")
dump(model_Camp4, "model_Camp4.joblib")
dump(model_Camp5, "model_Camp5.joblib")
dump(model_Camp6, "model_Camp6.joblib")